library(readr)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5     ✓ dplyr   1.0.8
✓ tibble  3.1.6     ✓ stringr 1.4.0
✓ tidyr   1.1.4     ✓ forcats 0.5.1
✓ purrr   0.3.4     
── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(gridExtra)

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine
library(ggpubr)
library(ggplot2)
library(gganimate)
library(gifski)
CBB = read_csv("cbb.csv")
<<<<<<< Updated upstream
Rows: 2455 Columns: 24
── Column specification ─────────────────────────────
Delimiter: ","
chr  (3): TEAM, CONF, POSTSEASON
dbl (21): G, W, ADJOE, ADJDE, BARTHAG, EFG_O, EFG...
=======

Rows: 2455 Columns: 24
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): TEAM, CONF, POSTSEASON
dbl (21): G, W, ADJOE, ADJDE, BARTHAG, EFG_O, EFG_D, TOR, TORD, ORB, DRB, FTR, FTRD, 2P_O, 2P_D, 3P_O, 3P_D, ADJ_T,...
>>>>>>> Stashed changes

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(CBB)
<<<<<<< Updated upstream
 [1] "TEAM"       "CONF"       "G"         
 [4] "W"          "ADJOE"      "ADJDE"     
 [7] "BARTHAG"    "EFG_O"      "EFG_D"     
[10] "TOR"        "TORD"       "ORB"       
[13] "DRB"        "FTR"        "FTRD"      
[16] "2P_O"       "2P_D"       "3P_O"      
[19] "3P_D"       "ADJ_T"      "WAB"       
[22] "POSTSEASON" "SEED"       "YEAR"      
=======
 [1] "TEAM"       "CONF"       "G"          "W"          "ADJOE"      "ADJDE"      "BARTHAG"    "EFG_O"     
 [9] "EFG_D"      "TOR"        "TORD"       "ORB"        "DRB"        "FTR"        "FTRD"       "2P_O"      
[17] "2P_D"       "3P_O"       "3P_D"       "ADJ_T"      "WAB"        "POSTSEASON" "SEED"       "YEAR"      
>>>>>>> Stashed changes
CBB <- CBB %>% 
  rename(School=TEAM) %>% 
  rename(Conference=CONF) %>% 
  rename(GamesPlayed = G) %>% 
  rename(GamesWon = W) %>% 
  rename(AdjustedOffensiveEfficiency=ADJOE) %>% 
  rename(AdjustedDefensiveEfficiency=ADJDE) %>% 
  rename(PowerRating = BARTHAG) %>% 
  rename(EffectiveFieldGoalPercentageShot=EFG_O) %>% 
  rename(EffectiveFieldGoalPercentageAllowed=EFG_D) %>% 
  rename(TurnoverRate=TOR) %>% 
  rename(StealRate=TORD) %>% 
  rename(OffensiveReboundRate=ORB) %>% 
  rename(OffensiveReboundRateAllowed=DRB) %>% 
  rename(FreeThrowRate=FTR) %>%
  rename(FreeThrowRateAllowed = FTRD) %>% 
  rename(TwoPointShootingPercentage = "2P_O") %>% 
  rename(TwoPointShootingPercentageAllowed='2P_D') %>% 
  rename(ThreePointShootingPercentage='3P_O') %>% 
  rename(ThreePointShootingPercentageAllowed='3P_D') %>% 
  rename(AdjustedTempo=ADJ_T) %>% 
  rename(WinsAboveBubble=WAB) %>% 
  rename(Postseason=POSTSEASON) %>% 
  rename(Seed=SEED) %>% 
  rename(Season=YEAR)
q1 <- filter(CBB, !is.na(Postseason)) %>% 
  mutate(WinningPercentage = GamesWon / GamesPlayed)

q1$Postseason[q1$Postseason == "Champions"] <- 1
q1$Postseason[q1$Postseason == "2ND"] <- 2
q1$Postseason[q1$Postseason == "F4"] <- 3
q1$Postseason[q1$Postseason == "E8"] <- 4
q1$Postseason[q1$Postseason == "S16"] <- 5
q1$Postseason[q1$Postseason == "R32"] <- 6
q1$Postseason[q1$Postseason == "R64"] <- 7
q1$Postseason[q1$Postseason == "R68"] <- 8
q1b = q1[,c(3, 4, 25, 5:24)]
none = lm(Postseason~1, data=q1b)
full = lm(Postseason~., data=q1b)
stepAIC(none, scope=list(upper=full), direction="both", trace=FALSE)
<<<<<<< Updated upstream ======= >>>>>>> Stashed changes

Call:
lm(formula = Postseason ~ GamesPlayed + GamesWon + WinningPercentage + 
    PowerRating + AdjustedTempo + OffensiveReboundRateAllowed + 
    FreeThrowRate, data = q1b)

Coefficients:
<<<<<<< Updated upstream
                (Intercept)  
                   -6.67169  
                GamesPlayed  
                    0.43206  
                   GamesWon  
                   -1.09323  
          WinningPercentage  
                   33.98854  
                PowerRating  
                   -0.78871  
              AdjustedTempo  
                    0.02261  
OffensiveReboundRateAllowed  
                   -0.02875  
              FreeThrowRate  
                    0.01321  
======= (Intercept) GamesPlayed GamesWon WinningPercentage -6.67169 0.43206 -1.09323 33.98854 PowerRating AdjustedTempo OffensiveReboundRateAllowed FreeThrowRate -0.78871 0.02261 -0.02875 0.01321
>>>>>>> Stashed changes
plot(Postseason ~ GamesPlayed + GamesWon + WinningPercentage + PowerRating + AdjustedTempo + OffensiveReboundRateAllowed + FreeThrowRate, data = q1b)
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes

q1_model = lm(Postseason ~ GamesPlayed + GamesWon + WinningPercentage + PowerRating + AdjustedTempo + OffensiveReboundRateAllowed + FreeThrowRate, data = q1b)

abline(q1_model)
Warning in abline(q1_model) :
  only using the first two of 8 regression coefficients
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
plot(q1_model)
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
q2 <- q1[c(2,22)] 
q2$Conference <- as.factor(q2$Conference)
q2$Postseason <- as.numeric(q2$Postseason)

boxplot(Postseason~factor(Conference), data=q2)

means = tapply(q2$Postseason, q2$Conference, mean)
points(means, col="red", pch=18)
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
q2_anova = aov(Postseason~factor(Conference), data=q2)
q2_anova
Call:
   aov(formula = Postseason ~ factor(Conference), data = q2)

Terms:
                factor(Conference) Residuals
Sum of Squares            205.2532  703.4086
Deg. of Freedom                 31       444

Residual standard error: 1.258671
Estimated effects may be unbalanced
plot(q2_anova)

# q2_anova plot satisfies criteria
summary(q2_anova)
                    Df Sum Sq Mean Sq F value           Pr(>F)    
factor(Conference)  31  205.3   6.621   4.179 0.00000000000583 ***
Residuals          444  703.4   1.584                             
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# P-value is low -> reject the null (that all means are the same), which means that the groups are different
# See which groups are the most different
pairwise.t.test(q2$Postseason, q2$Conference, p.adj="none")
q2_plot = TukeyHSD(q2_anova)
plot(q2_plot)


q3<- ggplot(data=CBB) +
  geom_point(aes(x=TurnoverRate,y=GamesWon)) +
  geom_smooth(aes(x=TurnoverRate,y=GamesWon)) + 
  xlab("Turnover Rate")+ylab("Games Won")

q3
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes

q4<-ggplot(data=filter(CBB,Conference=="ACC")) +
  geom_point(aes(x=PowerRating,y= GamesWon/GamesPlayed))+
  geom_smooth(aes(x=PowerRating,y= GamesWon/GamesPlayed))+
  ggtitle("Power Rating Efficiency in the ACC")+
  xlab("Power Rating") +ylab("Average Win")

q4
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
q5 = ggplot(data=CBB) +
  geom_point(aes(x=FreeThrowRate,y=GamesWon)) +
  xlab("Free Throw Percentage")+ylab("Games Won") + geom_smooth(aes(x=FreeThrowRate,y=GamesWon))
q5
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
q6 = ggplot(data=CBB) +
  geom_point(aes(x=AdjustedOffensiveEfficiency,y= GamesWon / GamesPlayed)) +
  xlab("Offensive Efficiency")+ylab("Winning Percentage") + geom_smooth(aes(x=AdjustedOffensiveEfficiency,y= GamesWon / GamesPlayed))
q6
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
#Plots for q7
q7plot.1 = ggplot(data=CBB, aes(x=AdjustedOffensiveEfficiency, y=PowerRating)) +
  geom_point()

q8plot.2 = ggplot(data=CBB, aes(x=AdjustedDefensiveEfficiency, y=PowerRating)) +
  geom_point()

grid.arrange(q7plot.1, q8plot.2, ncol=2)
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
#Plot for q8
q8plot.1 = CBB %>%
  group_by(Conference) %>%
  summarise(Count = n(), GamesWonAvg = mean(GamesWon)) %>%
  arrange(desc(Count)) %>%
  ggplot(aes(x=Conference, y=GamesWonAvg)) +
    geom_bar(stat='identity')

q8plot.1
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
#Plot for q9
q9 <- ggplot(data=CBB, aes(x=TwoPointShootingPercentageAllowed,y=ThreePointShootingPercentageAllowed)) + 
  geom_point()+
  stat_regline_equation(label.y = 40, aes(label = ..rr.label..))
<<<<<<< Updated upstream
Error in stat_regline_equation(label.y = 40, aes(label = ..rr.label..)) : 
  could not find function "stat_regline_equation"
=======

>>>>>>> Stashed changes
#Plot for q10
q10.1 <- ggplot(data=CBB, aes(x=ThreePointShootingPercentage,y=AdjustedOffensiveEfficiency)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation(label.y = 120, aes(label = ..rr.label..))
<<<<<<< Updated upstream
Error in stat_regline_equation(label.y = 120, aes(label = ..rr.label..)) : 
  could not find function "stat_regline_equation"
=======
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

>>>>>>> Stashed changes
q11<-ggplot(data=CBB) +
  geom_point(aes(x=ThreePointShootingPercentage,y= GamesWon))+
  geom_smooth(aes(x=ThreePointShootingPercentage,y= GamesWon))+
  ggtitle("Correlation Between 3P Shooting Percentage and Games Won")+
  xlab("3P Shooting %") +ylab("Games Won")
q11
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
<<<<<<< Updated upstream

=======

>>>>>>> Stashed changes
q12<-ggplot(data=CBB) +
  geom_point(aes(x=TwoPointShootingPercentageAllowed,y= FreeThrowRateAllowed))+
  geom_smooth(aes(x=TwoPointShootingPercentageAllowed,y= FreeThrowRateAllowed))+
  ggtitle("Correlation Between Allowed 2pt Shooting % and Free Throw Rate Allowed")+
  xlab("2pt Shot Shooting Percentage Allowed") +ylab("Free Throw Rate Alowed")
q12
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
<<<<<<< Updated upstream

Possible graph for question 4: Do more wins over bubble teams lead to higher seeds in March Madness?

Bubblewins = ggplot(data=CBB) +
  geom_point(aes(x=WinsAboveBubble,y=Seed, color = Conference)) +
  xlab("Wins Above Bubble") + ylab("Seed") + geom_smooth(aes(x=WinsAboveBubble, y = Seed))
Bubblewins
=======

#Code for follow-up q3

ggplot(q1, aes(x=Season, y=AdjustedOffensiveEfficiency, group=Conference, color=Conference)) +
  geom_line() 

  #transition_reveal(Season)
mod1 = lm(AdjustedOffensiveEfficiency~Season+Conference, data=q1)
summary(mod1)

Call:
lm(formula = AdjustedOffensiveEfficiency ~ Season + Conference, 
    data = q1)

Residuals:
     Min       1Q   Median       3Q      Max 
-11.7993  -2.9730  -0.0997   3.0028  13.8258 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -310.4006   211.9532  -1.464 0.143773    
Season            0.2086     0.1052   1.984 0.047902 *  
ConferenceACC     6.3718     1.1199   5.690 2.32e-08 ***
ConferenceAE     -5.5240     1.9345  -2.855 0.004499 ** 
ConferenceAmer    2.2932     1.3812   1.660 0.097569 .  
ConferenceASun   -4.7269     2.0556  -2.300 0.021938 *  
ConferenceB10     5.3016     1.1251   4.712 3.29e-06 ***
ConferenceB12     4.9407     1.1299   4.373 1.53e-05 ***
ConferenceBE      4.0903     1.1536   3.546 0.000433 ***
ConferenceBSky   -5.1240     1.9345  -2.649 0.008369 ** 
ConferenceBSth   -7.0219     1.8395  -3.817 0.000154 ***
ConferenceBW     -6.6240     1.9345  -3.424 0.000674 ***
ConferenceCAA    -1.7097     1.9345  -0.884 0.377283    
ConferenceCUSA   -4.8097     1.9345  -2.486 0.013277 *  
ConferenceHorz   -4.9383     1.9345  -2.553 0.011023 *  
ConferenceIvy    -3.0383     1.9345  -1.571 0.116998    
ConferenceMAAC   -2.8097     1.9345  -1.452 0.147095    
ConferenceMAC    -2.3955     1.9345  -1.238 0.216276    
ConferenceMEAC  -12.1955     1.9345  -6.304 7.02e-10 ***
ConferenceMVC     2.0923     1.6919   1.237 0.216889    
ConferenceMWC    -0.2815     1.4476  -0.194 0.845912    
ConferenceNEC    -6.7240     1.9345  -3.476 0.000560 ***
ConferenceOVC    -1.0844     1.8395  -0.590 0.555826    
ConferenceP12     2.8533     1.2072   2.364 0.018527 *  
ConferencePat    -4.6955     1.9345  -2.427 0.015615 *  
ConferenceSB     -3.0029     1.8368  -1.635 0.102790    
ConferenceSC     -3.7097     1.9345  -1.918 0.055799 .  
ConferenceSEC     4.3603     1.1985   3.638 0.000307 ***
ConferenceSlnd   -4.1240     1.9345  -2.132 0.033574 *  
ConferenceSum    -1.1955     1.9345  -0.618 0.536922    
ConferenceSWAC   -8.8669     1.9345  -4.583 5.96e-06 ***
ConferenceWAC    -3.1240     1.9345  -1.615 0.107050    
ConferenceWCC     6.9869     1.5886   4.398 1.37e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.522 on 443 degrees of freedom
Multiple R-squared:  0.5283,    Adjusted R-squared:  0.4942 
F-statistic:  15.5 on 32 and 443 DF,  p-value: < 2.2e-16
plot(AdjustedOffensiveEfficiency~Season+Conference, data=q1)
>>>>>>> Stashed changes
Warning in xy.coords(x, y, xlabel, ylabel, log) :
  NAs introduced by coercion
Warning in min(x) : no non-missing arguments to min; returning Inf
Warning in max(x) : no non-missing arguments to max; returning -Inf

Error in plot.window(...) : need finite 'xlim' values

>>>>>>> 32a90b4d8188f3c752b6b224b5b5e8b9a8e128d0

<<<<<<< Updated upstream
LS0tCnRpdGxlOiAiRURBIERhdGEgQ2xlYW51cCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkocmVhZHIpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGdyaWRFeHRyYSkKYGBgCgpgYGB7cn0KQ0JCID0gcmVhZF9jc3YoImNiYi5jc3YiKQpuYW1lcyhDQkIpCkNCQiA8LSBDQkIgJT4lIAogIHJlbmFtZShTY2hvb2w9VEVBTSkgJT4lIAogIHJlbmFtZShDb25mZXJlbmNlPUNPTkYpICU+JSAKICByZW5hbWUoR2FtZXNQbGF5ZWQgPSBHKSAlPiUgCiAgcmVuYW1lKEdhbWVzV29uID0gVykgJT4lIAogIHJlbmFtZShBZGp1c3RlZE9mZmVuc2l2ZUVmZmljaWVuY3k9QURKT0UpICU+JSAKICByZW5hbWUoQWRqdXN0ZWREZWZlbnNpdmVFZmZpY2llbmN5PUFESkRFKSAlPiUgCiAgcmVuYW1lKFBvd2VyUmF0aW5nID0gQkFSVEhBRykgJT4lIAogIHJlbmFtZShFZmZlY3RpdmVGaWVsZEdvYWxQZXJjZW50YWdlU2hvdD1FRkdfTykgJT4lIAogIHJlbmFtZShFZmZlY3RpdmVGaWVsZEdvYWxQZXJjZW50YWdlQWxsb3dlZD1FRkdfRCkgJT4lIAogIHJlbmFtZShUdXJub3ZlclJhdGU9VE9SKSAlPiUgCiAgcmVuYW1lKFN0ZWFsUmF0ZT1UT1JEKSAlPiUgCiAgcmVuYW1lKE9mZmVuc2l2ZVJlYm91bmRSYXRlPU9SQikgJT4lIAogIHJlbmFtZShPZmZlbnNpdmVSZWJvdW5kUmF0ZUFsbG93ZWQ9RFJCKSAlPiUgCiAgcmVuYW1lKEZyZWVUaHJvd1JhdGU9RlRSKSAlPiUKICByZW5hbWUoRnJlZVRocm93UmF0ZUFsbG93ZWQgPSBGVFJEKSAlPiUgCiAgcmVuYW1lKFR3b1BvaW50U2hvb3RpbmdQZXJjZW50YWdlID0gIjJQX08iKSAlPiUgCiAgcmVuYW1lKFR3b1BvaW50U2hvb3RpbmdQZXJjZW50YWdlQWxsb3dlZD0nMlBfRCcpICU+JSAKICByZW5hbWUoVGhyZWVQb2ludFNob290aW5nUGVyY2VudGFnZT0nM1BfTycpICU+JSAKICByZW5hbWUoVGhyZWVQb2ludFNob290aW5nUGVyY2VudGFnZUFsbG93ZWQ9JzNQX0QnKSAlPiUgCiAgcmVuYW1lKEFkanVzdGVkVGVtcG89QURKX1QpICU+JSAKICByZW5hbWUoV2luc0Fib3ZlQnViYmxlPVdBQikgJT4lIAogIHJlbmFtZShQb3N0c2Vhc29uPVBPU1RTRUFTT04pICU+JSAKICByZW5hbWUoU2VlZD1TRUVEKSAlPiUgCiAgcmVuYW1lKFNlYXNvbj1ZRUFSKQoKYGBgCgpgYGB7cn0KcTEgPC0gZmlsdGVyKENCQiwgIWlzLm5hKFBvc3RzZWFzb24pKSAlPiUgCiAgbXV0YXRlKFdpbm5pbmdQZXJjZW50YWdlID0gR2FtZXNXb24gLyBHYW1lc1BsYXllZCkKCnExJFBvc3RzZWFzb25bcTEkUG9zdHNlYXNvbiA9PSAiQ2hhbXBpb25zIl0gPC0gMQpxMSRQb3N0c2Vhc29uW3ExJFBvc3RzZWFzb24gPT0gIjJORCJdIDwtIDIKcTEkUG9zdHNlYXNvbltxMSRQb3N0c2Vhc29uID09ICJGNCJdIDwtIDMKcTEkUG9zdHNlYXNvbltxMSRQb3N0c2Vhc29uID09ICJFOCJdIDwtIDQKcTEkUG9zdHNlYXNvbltxMSRQb3N0c2Vhc29uID09ICJTMTYiXSA8LSA1CnExJFBvc3RzZWFzb25bcTEkUG9zdHNlYXNvbiA9PSAi
UjMyIl0gPC0gNgpxMSRQb3N0c2Vhc29uW3ExJFBvc3RzZWFzb24gPT0gIlI2NCJdIDwtIDcKcTEkUG9zdHNlYXNvbltxMSRQb3N0c2Vhc29uID09ICJSNjgiXSA8LSA4CmBgYAoKYGBge3J9CnExYiA9IHExWyxjKDMsIDQsIDI1LCA1OjI0KV0Kbm9uZSA9IGxtKFBvc3RzZWFzb25+MSwgZGF0YT1xMWIpCmZ1bGwgPSBsbShQb3N0c2Vhc29ufi4sIGRhdGE9cTFiKQpzdGVwQUlDKG5vbmUsIHNjb3BlPWxpc3QodXBwZXI9ZnVsbCksIGRpcmVjdGlvbj0iYm90aCIsIHRyYWNlPUZBTFNFKQpgYGAKCmBgYHtyfQpwbG90KFBvc3RzZWFzb24gfiBHYW1lc1BsYXllZCArIEdhbWVzV29uICsgV2lubmluZ1BlcmNlbnRhZ2UgKyBQb3dlclJhdGluZyArIEFkanVzdGVkVGVtcG8gKyBPZmZlbnNpdmVSZWJvdW5kUmF0ZUFsbG93ZWQgKyBGcmVlVGhyb3dSYXRlLCBkYXRhID0gcTFiKQoKcTFfbW9kZWwgPSBsbShQb3N0c2Vhc29uIH4gR2FtZXNQbGF5ZWQgKyBHYW1lc1dvbiArIFdpbm5pbmdQZXJjZW50YWdlICsgUG93ZXJSYXRpbmcgKyBBZGp1c3RlZFRlbXBvICsgT2ZmZW5zaXZlUmVib3VuZFJhdGVBbGxvd2VkICsgRnJlZVRocm93UmF0ZSwgZGF0YSA9IHExYikKCmFibGluZShxMV9tb2RlbCkKYGBgCgpgYGB7cn0KcGxvdChxMV9tb2RlbCkKYGBgCgpgYGB7cn0KcTIgPC0gcTFbYygyLDIyKV0gCnEyJENvbmZlcmVuY2UgPC0gYXMuZmFjdG9yKHEyJENvbmZlcmVuY2UpCnEyJFBvc3RzZWFzb24gPC0gYXMubnVtZXJpYyhxMiRQb3N0c2Vhc29uKQoKYm94cGxvdChQb3N0c2Vhc29ufmZhY3RvcihDb25mZXJlbmNlKSwgZGF0YT1xMikKCm1lYW5zID0gdGFwcGx5KHEyJFBvc3RzZWFzb24sIHEyJENvbmZlcmVuY2UsIG1lYW4pCnBvaW50cyhtZWFucywgY29sPSJyZWQiLCBwY2g9MTgpCmBgYAoKYGBge3J9CnEyX2Fub3ZhID0gYW92KFBvc3RzZWFzb25+ZmFjdG9yKENvbmZlcmVuY2UpLCBkYXRhPXEyKQpxMl9hbm92YQpwbG90KHEyX2Fub3ZhKQojIHEyX2Fub3ZhIHBsb3Qgc2F0aXNmaWVzIGNyaXRlcmlhCnN1bW1hcnkocTJfYW5vdmEpCiMgUC12YWx1ZSBpcyBsb3cgLT4gcmVqZWN0IHRoZSBudWxsICh0aGF0IGFsbCBtZWFucyBhcmUgdGhlIHNhbWUpLCB3aGljaCBtZWFucyB0aGF0IHRoZSBncm91cHMgYXJlIGRpZmZlcmVudApgYGAKCmBgYHtyLCBldmFsPUZ9CiMgU2VlIHdoaWNoIGdyb3VwcyBhcmUgdGhlIG1vc3QgZGlmZmVyZW50CnBhaXJ3aXNlLnQudGVzdChxMiRQb3N0c2Vhc29uLCBxMiRDb25mZXJlbmNlLCBwLmFkaj0ibm9uZSIpCmBgYAoKYGBge3J9CnEyX3Bsb3QgPSBUdWtleUhTRChxMl9hbm92YSkKcGxvdChxMl9wbG90KQpgYGAKCmBgYHtyfQoKcTM8LSBnZ3Bsb3QoZGF0YT1DQkIpICsKICBnZW9tX3BvaW50KGFlcyh4PVR1cm5vdmVyUmF0ZSx5PUdhbWVzV29uKSkgKwogIGdlb21fc21vb3RoKGFlcyh4PVR1cm5vdmVyUmF0ZSx5PUdhbWVzV29uKSkgKyAKICB4bGFiKCJUdXJub3ZlciBSYXRlIikreWxhYigiR2FtZXMgV29uIikKCnEz
CmBgYAoKYGBge3J9CgpxNDwtZ2dwbG90KGRhdGE9ZmlsdGVyKENCQixDb25mZXJlbmNlPT0iQUNDIikpICsKICBnZW9tX3BvaW50KGFlcyh4PVBvd2VyUmF0aW5nLHk9IEdhbWVzV29uL0dhbWVzUGxheWVkKSkrCiAgZ2VvbV9zbW9vdGgoYWVzKHg9UG93ZXJSYXRpbmcseT0gR2FtZXNXb24vR2FtZXNQbGF5ZWQpKSsKICBnZ3RpdGxlKCJQb3dlciBSYXRpbmcgRWZmaWNpZW5jeSBpbiB0aGUgQUNDIikrCiAgeGxhYigiUG93ZXIgUmF0aW5nIikgK3lsYWIoIkF2ZXJhZ2UgV2luIikKCnE0CmBgYAoKCgpgYGB7cn0KcTUgPSBnZ3Bsb3QoZGF0YT1DQkIpICsKICBnZW9tX3BvaW50KGFlcyh4PUZyZWVUaHJvd1JhdGUseT1HYW1lc1dvbikpICsKICB4bGFiKCJGcmVlIFRocm93IFBlcmNlbnRhZ2UiKSt5bGFiKCJHYW1lcyBXb24iKSArIGdlb21fc21vb3RoKGFlcyh4PUZyZWVUaHJvd1JhdGUseT1HYW1lc1dvbikpCnE1CmBgYAoKCmBgYHtyfQpxNiA9IGdncGxvdChkYXRhPUNCQikgKwogIGdlb21fcG9pbnQoYWVzKHg9QWRqdXN0ZWRPZmZlbnNpdmVFZmZpY2llbmN5LHk9IEdhbWVzV29uIC8gR2FtZXNQbGF5ZWQpKSArCiAgeGxhYigiT2ZmZW5zaXZlIEVmZmljaWVuY3kiKSt5bGFiKCJXaW5uaW5nIFBlcmNlbnRhZ2UiKSArIGdlb21fc21vb3RoKGFlcyh4PUFkanVzdGVkT2ZmZW5zaXZlRWZmaWNpZW5jeSx5PSBHYW1lc1dvbiAvIEdhbWVzUGxheWVkKSkKcTYKYGBgCgoKCgpgYGB7cn0KI1Bsb3RzIGZvciBxNwpxN3Bsb3QuMSA9IGdncGxvdChkYXRhPUNCQiwgYWVzKHg9QWRqdXN0ZWRPZmZlbnNpdmVFZmZpY2llbmN5LCB5PVBvd2VyUmF0aW5nKSkgKwogIGdlb21fcG9pbnQoKQoKcThwbG90LjIgPSBnZ3Bsb3QoZGF0YT1DQkIsIGFlcyh4PUFkanVzdGVkRGVmZW5zaXZlRWZmaWNpZW5jeSwgeT1Qb3dlclJhdGluZykpICsKICBnZW9tX3BvaW50KCkKCmdyaWQuYXJyYW5nZShxN3Bsb3QuMSwgcThwbG90LjIsIG5jb2w9MikKYGBgCgpgYGB7cn0KI1Bsb3QgZm9yIHE4CnE4cGxvdC4xID0gQ0JCICU+JQogIGdyb3VwX2J5KENvbmZlcmVuY2UpICU+JQogIHN1bW1hcmlzZShDb3VudCA9IG4oKSwgR2FtZXNXb25BdmcgPSBtZWFuKEdhbWVzV29uKSkgJT4lCiAgYXJyYW5nZShkZXNjKENvdW50KSkgJT4lCiAgZ2dwbG90KGFlcyh4PUNvbmZlcmVuY2UsIHk9R2FtZXNXb25BdmcpKSArCiAgICBnZW9tX2JhcihzdGF0PSdpZGVudGl0eScpCgpxOHBsb3QuMQpgYGAKCgoKYGBge3J9CiNQbG90IGZvciBxOQpxOSA8LSBnZ3Bsb3QoZGF0YT1DQkIsIGFlcyh4PVR3b1BvaW50U2hvb3RpbmdQZXJjZW50YWdlQWxsb3dlZCx5PVRocmVlUG9pbnRTaG9vdGluZ1BlcmNlbnRhZ2VBbGxvd2VkKSkgKyAKICBnZW9tX3BvaW50KCkrCiAgc3RhdF9yZWdsaW5lX2VxdWF0aW9uKGxhYmVsLnkgPSA0MCwgYWVzKGxhYmVsID0gLi5yci5sYWJlbC4uKSkKCnE5CmBgYAoKCmBgYHtyfQojUGxvdCBmb3IgcTEwCnExMC4xIDwtIGdncGxvdChkYXRhPUNCQiwgYWVzKHg9VGhy
ZWVQb2ludFNob290aW5nUGVyY2VudGFnZSx5PUFkanVzdGVkT2ZmZW5zaXZlRWZmaWNpZW5jeSkpICsgCiAgZ2VvbV9wb2ludCgpICsgCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSkgKwogIHN0YXRfcmVnbGluZV9lcXVhdGlvbihsYWJlbC55ID0gMTIwLCBhZXMobGFiZWwgPSAuLnJyLmxhYmVsLi4pKQogICAgICAgICAgIAoKcTEwLjIgPC0gZ2dwbG90KGRhdGE9Q0JCLCBhZXMoeD1Ud29Qb2ludFNob290aW5nUGVyY2VudGFnZSwgeT1BZGp1c3RlZE9mZmVuc2l2ZUVmZmljaWVuY3kpKSArIAogIGdlb21fcG9pbnQoKSArIAogIGdlb21fc21vb3RoKG1ldGhvZCA9ICJsbSIsIHNlID0gRkFMU0UpICsKICBzdGF0X3JlZ2xpbmVfZXF1YXRpb24obGFiZWwueSA9IDEyMCwgYWVzKGxhYmVsID0gLi5yci5sYWJlbC4uKSkKICAKCmdyaWQuYXJyYW5nZShxMTAuMSwgcTEwLjIsIG5jb2w9MSkKYGBgCgpgYGB7cn0KcTExPC1nZ3Bsb3QoZGF0YT1DQkIpICsKICBnZW9tX3BvaW50KGFlcyh4PVRocmVlUG9pbnRTaG9vdGluZ1BlcmNlbnRhZ2UseT0gR2FtZXNXb24pKSsKICBnZW9tX3Ntb290aChhZXMoeD1UaHJlZVBvaW50U2hvb3RpbmdQZXJjZW50YWdlLHk9IEdhbWVzV29uKSkrCiAgZ2d0aXRsZSgiQ29ycmVsYXRpb24gQmV0d2VlbiAzUCBTaG9vdGluZyBQZXJjZW50YWdlIGFuZCBHYW1lcyBXb24iKSsKICB4bGFiKCIzUCBTaG9vdGluZyAlIikgK3lsYWIoIkdhbWVzIFdvbiIpCnExMQpgYGAKCmBgYHtyfQpxMTI8LWdncGxvdChkYXRhPUNCQikgKwogIGdlb21fcG9pbnQoYWVzKHg9VHdvUG9pbnRTaG9vdGluZ1BlcmNlbnRhZ2VBbGxvd2VkLHk9IEZyZWVUaHJvd1JhdGVBbGxvd2VkKSkrCiAgZ2VvbV9zbW9vdGgoYWVzKHg9VHdvUG9pbnRTaG9vdGluZ1BlcmNlbnRhZ2VBbGxvd2VkLHk9IEZyZWVUaHJvd1JhdGVBbGxvd2VkKSkrCiAgZ2d0aXRsZSgiQ29ycmVsYXRpb24gQmV0d2VlbiBBbGxvd2VkIDJwdCBTaG9vdGluZyAlIGFuZCBGcmVlIFRocm93IFJhdGUgQWxsb3dlZCIpKwogIHhsYWIoIjJwdCBTaG90IFNob290aW5nIFBlcmNlbnRhZ2UgQWxsb3dlZCIpICt5bGFiKCJGcmVlIFRocm93IFJhdGUgQWxvd2VkIikKcTEyCgpgYGAKClBvc3NpYmxlIGdyYXBoIGZvciBxdWVzdGlvbiA0OgpEbyBtb3JlIHdpbnMgb3ZlciBidWJibGUgdGVhbXMgbGVhZCB0byBoaWdoZXIgc2VlZHMgaW4gTWFyY2ggTWFkbmVzcz8KYGBge3J9CkJ1YmJsZXdpbnMgPSBnZ3Bsb3QoZGF0YT1DQkIpICsKICBnZW9tX3BvaW50KGFlcyh4PVdpbnNBYm92ZUJ1YmJsZSx5PVNlZWQsIGNvbG9yID0gQ29uZmVyZW5jZSkpICsKICB4bGFiKCJXaW5zIEFib3ZlIEJ1YmJsZSIpICsgeWxhYigiU2VlZCIpICsgZ2VvbV9zbW9vdGgoYWVzKHg9V2luc0Fib3ZlQnViYmxlLCB5ID0gU2VlZCkpCkJ1YmJsZXdpbnMKYGBgCgoKCgojID4+Pj4+Pj4gMzJhOTBiNGQ4MTg4ZjNjNzUyYjZiMjI0YjViNWU4YjlhOGUxMjhkMAo=
=======
---
title: "EDA Data Cleanup"
output: html_notebook
---

```{r}
library(readr)
library(tidyverse)
library(gridExtra)
library(ggpubr)
library(ggplot2)
library(gganimate)
library(gifski)
```

```{r}
# Read the raw data and show the column names as they appear in the file.
CBB <- read_csv("cbb.csv")
names(CBB)

# Replace the abbreviated headers with descriptive names (new = old) in a
# single rename() call; column order is left untouched.
CBB <- CBB %>%
  rename(
    School = TEAM,
    Conference = CONF,
    GamesPlayed = G,
    GamesWon = W,
    AdjustedOffensiveEfficiency = ADJOE,
    AdjustedDefensiveEfficiency = ADJDE,
    PowerRating = BARTHAG,
    EffectiveFieldGoalPercentageShot = EFG_O,
    EffectiveFieldGoalPercentageAllowed = EFG_D,
    TurnoverRate = TOR,
    StealRate = TORD,
    OffensiveReboundRate = ORB,
    OffensiveReboundRateAllowed = DRB,
    FreeThrowRate = FTR,
    FreeThrowRateAllowed = FTRD,
    # Names starting with a digit are not syntactic, so they stay quoted.
    TwoPointShootingPercentage = "2P_O",
    TwoPointShootingPercentageAllowed = "2P_D",
    ThreePointShootingPercentage = "3P_O",
    ThreePointShootingPercentageAllowed = "3P_D",
    AdjustedTempo = ADJ_T,
    WinsAboveBubble = WAB,
    Postseason = POSTSEASON,
    Seed = SEED,
    Season = YEAR
  )

```

```{r}
# Numeric code for each postseason finish: 1 = champion ... 8 = R68.
postseason_codes <- c(
  "Champions" = 1, "2ND" = 2, "F4" = 3, "E8" = 4,
  "S16" = 5, "R32" = 6, "R64" = 7, "R68" = 8
)

# Keep only rows with a recorded postseason result, add the win ratio, and
# recode Postseason through the lookup above. Assigning numbers into the
# character column one label at a time would leave it as digit strings
# ("1", "2", ...); the lookup yields a genuinely numeric column in the same
# position, so later modelling does not rely on implicit coercion.
q1 <- CBB %>%
  filter(!is.na(Postseason)) %>%
  mutate(
    WinningPercentage = GamesWon / GamesPlayed,
    # Labels that are not in the lookup become NA instead of staying as text.
    Postseason = unname(postseason_codes[Postseason])
  )
```

```{r}
# Model data: games played, games won, win ratio, then every column from
# AdjustedOffensiveEfficiency through Season (this range contains the
# Postseason response). Columns are picked by name so the selection does not
# silently change if the column layout does.
q1b <- q1 %>%
  dplyr::select(
    GamesPlayed, GamesWon, WinningPercentage,
    AdjustedOffensiveEfficiency:Season
  )

# The intercept-only and all-predictor models bound the stepwise search.
none <- lm(Postseason ~ 1, data = q1b)
full <- lm(Postseason ~ ., data = q1b)

# stepAIC() lives in MASS, which this notebook never attaches. Calling it
# through its namespace avoids a "could not find function" error in a clean
# session and keeps MASS::select from masking dplyr::select.
MASS::stepAIC(
  none,
  scope = list(upper = full),
  direction = "both",
  trace = FALSE
)
```

```{r}
# Model with the predictors kept by the stepwise search.
q1_model <- lm(
  Postseason ~ GamesPlayed + GamesWon + WinningPercentage + PowerRating +
    AdjustedTempo + OffensiveReboundRateAllowed + FreeThrowRate,
  data = q1b
)

# One scatterplot per predictor, each with its own simple-regression line.
# abline() on the multi-predictor model would use only the intercept and the
# first slope ("only using the first two of 8 regression coefficients") and
# draw that unrelated line on whichever panel happened to be plotted last.
postseason_y <- as.numeric(q1b$Postseason)
for (predictor in attr(terms(q1_model), "term.labels")) {
  predictor_x <- q1b[[predictor]]
  plot(predictor_x, postseason_y, xlab = predictor, ylab = "Postseason")
  abline(lm(postseason_y ~ predictor_x))
}
```

```{r}
# Diagnostic plots for the fitted linear model q1_model.
plot(q1_model)
```

```{r}
# Columns 2 and 22 of q1 (Conference and Postseason), converted to a factor
# and a numeric vector respectively.
q2 <- q1[c(2, 22)] %>%
  mutate(
    Conference = as.factor(Conference),
    Postseason = as.numeric(Postseason)
  )

# One box of postseason codes per conference.
boxplot(Postseason ~ factor(Conference), data = q2)

# Overlay each conference's mean postseason code in red.
means <- with(q2, tapply(Postseason, Conference, mean))
points(means, col = "red", pch = 18)
```

```{r}
# One-way ANOVA of the postseason code on conference.
q2_anova <- aov(Postseason ~ factor(Conference), data = q2)
print(q2_anova)

# Diagnostic plots; the author judged that they satisfy the ANOVA criteria.
plot(q2_anova)

# A low p-value rejects the null hypothesis that all conference means are
# equal, i.e. the groups differ.
summary(q2_anova)
```

```{r, eval=F}
# See which conferences differ most from one another.
# The argument is spelled out in full (p.adjust.method) instead of relying on
# partial matching of `p.adj`. With "none" the p-values are NOT corrected for
# the many pairwise comparisons, so treat small values with caution.
pairwise.t.test(q2$Postseason, q2$Conference, p.adjust.method = "none")
```

```{r}
# Tukey honest-significant-difference comparisons from the ANOVA fit,
# stored and then plotted.
q2_plot <- TukeyHSD(q2_anova)
plot(q2_plot)
```

```{r}

# Games won against turnover rate for every row of CBB, with a smoothed trend.
# The x/y mapping is declared once and inherited by both layers.
q3 <- CBB %>%
  ggplot(aes(x = TurnoverRate, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Turnover Rate", y = "Games Won")

q3
```

```{r}

# Win ratio against power rating, restricted to ACC rows, with a smoothed
# trend. The y axis is GamesWon / GamesPlayed, so it is labelled
# "Winning Percentage" to match the same quantity elsewhere in this notebook.
q4 <- CBB %>%
  filter(Conference == "ACC") %>%
  ggplot(aes(x = PowerRating, y = GamesWon / GamesPlayed)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Power Rating Efficiency in the ACC") +
  xlab("Power Rating") +
  ylab("Winning Percentage")

q4
```



```{r}
# Games won against free throw rate, with a smoothed trend.
# The plotted column is FreeThrowRate, so the axis says "Rate" rather than
# "Percentage".
q5 <- ggplot(data = CBB, aes(x = FreeThrowRate, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  xlab("Free Throw Rate") +
  ylab("Games Won")
q5
```


```{r}
# Win ratio (GamesWon / GamesPlayed) against adjusted offensive efficiency,
# with a smoothed trend; both layers inherit the shared mapping.
q6 <- CBB %>%
  ggplot(aes(x = AdjustedOffensiveEfficiency, y = GamesWon / GamesPlayed)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Offensive Efficiency", y = "Winning Percentage")
q6
```

```{r}
# Plots for q7: power rating against offensive and defensive efficiency,
# shown side by side.
q7plot.1 <- ggplot(
  data = CBB,
  aes(x = AdjustedOffensiveEfficiency, y = PowerRating)
) +
  geom_point()

# Named q7plot.2 (previously q8plot.2) so both panels of this question share
# the q7 prefix and cannot be confused with the q8 plot.
q7plot.2 <- ggplot(
  data = CBB,
  aes(x = AdjustedDefensiveEfficiency, y = PowerRating)
) +
  geom_point()

grid.arrange(q7plot.1, q7plot.2, ncol = 2)
```

```{r}
# Plot for q8: average games won per conference.
# Sorting the summary rows does not change bar order on a discrete axis
# (ggplot orders categories alphabetically), so the bars are ordered
# explicitly by descending row count with reorder().
q8plot.1 <- CBB %>%
  group_by(Conference) %>%
  summarise(Count = n(), GamesWonAvg = mean(GamesWon)) %>%
  ggplot(aes(x = reorder(Conference, -Count), y = GamesWonAvg)) +
  geom_col() +
  # Keep the plain axis title instead of the reorder() expression.
  xlab("Conference")

q8plot.1
```



```{r}
# Plot for q9: three-point against two-point shooting percentage allowed,
# annotated with the R-squared label at y = 40.
# after_stat() replaces the deprecated `..rr.label..` notation for referring
# to a variable computed by the stat.
q9 <- ggplot(
  data = CBB,
  aes(
    x = TwoPointShootingPercentageAllowed,
    y = ThreePointShootingPercentageAllowed
  )
) +
  geom_point() +
  stat_regline_equation(label.y = 40, aes(label = after_stat(rr.label)))

q9
```


```{r}
# Plot for q10: adjusted offensive efficiency against three-point (top) and
# two-point (bottom) shooting percentage, each with a straight-line fit and
# its R-squared label at y = 120.
# after_stat() replaces the deprecated `..rr.label..` notation for referring
# to a variable computed by the stat.
q10.1 <- ggplot(
  data = CBB,
  aes(x = ThreePointShootingPercentage, y = AdjustedOffensiveEfficiency)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation(label.y = 120, aes(label = after_stat(rr.label)))

q10.2 <- ggplot(
  data = CBB,
  aes(x = TwoPointShootingPercentage, y = AdjustedOffensiveEfficiency)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  stat_regline_equation(label.y = 120, aes(label = after_stat(rr.label)))

grid.arrange(q10.1, q10.2, ncol = 1)
```

```{r}
# Games won against three-point shooting percentage, with a smoothed trend;
# the mapping is declared once and shared by both layers.
q11 <- CBB %>%
  ggplot(aes(x = ThreePointShootingPercentage, y = GamesWon)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Correlation Between 3P Shooting Percentage and Games Won",
    x = "3P Shooting %",
    y = "Games Won"
  )
q11
```

```{r}
# Free throw rate allowed against two-point shooting percentage allowed,
# with a smoothed trend. The y-axis label typo ("Alowed") is corrected.
q12 <- ggplot(
  data = CBB,
  aes(x = TwoPointShootingPercentageAllowed, y = FreeThrowRateAllowed)
) +
  geom_point() +
  geom_smooth() +
  ggtitle(
    "Correlation Between Allowed 2pt Shooting % and Free Throw Rate Allowed"
  ) +
  xlab("2pt Shot Shooting Percentage Allowed") +
  ylab("Free Throw Rate Allowed")
q12

```

```{r}
# Code for follow-up q3: adjusted offensive efficiency by season, one
# coloured line per conference.
# NOTE(review): q1 may hold several teams per conference and season, in which
# case each line zigzags within a season -- confirm whether a per-conference
# average was intended.

q1 %>%
  ggplot(
    aes(
      x = Season,
      y = AdjustedOffensiveEfficiency,
      group = Conference,
      colour = Conference
    )
  ) +
  geom_line()
  # transition_reveal(Season)  (animation step, currently disabled)
```

```{r}
# Linear model of adjusted offensive efficiency on season and conference.
mod1 <- lm(AdjustedOffensiveEfficiency ~ Season + Conference, data = q1)
summary(mod1)

# Scatterplot against season with the overall season-only trend line.
# abline(mod1) would use just the first two of the model's many coefficients,
# so a dedicated simple regression supplies the line instead.
plot(AdjustedOffensiveEfficiency ~ Season, data = q1)
abline(lm(AdjustedOffensiveEfficiency ~ Season, data = q1))

# Conference is a character column; passing it to plot() as-is fails with
# "NAs introduced by coercion" / "need finite 'xlim' values". Converting it
# to a factor gives one box per conference.
plot(AdjustedOffensiveEfficiency ~ factor(Conference), data = q1)
```

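<!-- Stray merge-conflict marker (32a90b4d8188f3c752b6b224b5b5e8b9a8e128d0) left over from an earlier merge; kept as a comment so it no longer renders as a heading. -->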

>>>>>>> Stashed changes